El fichero de datos contiente 145 variables. Muchas de ellas se pueden omitir porque son las respuestas a cada de una de las preguntas de tres cuestionarios diferentes y en este estudio se utilizarán sólo las puntuaciones estandarizadas del cuestionario DMSES completo y el de cada uno de su cuatro apartados.
También se descartarán las variables que son la categorización de otras variables del fichero. Si alguna categorización o agrupación de valores fuera necesaria se creará en su debido momento.
Por otra lado se crearán variables nuevas:
library(MASS)
library(dplyr)
library(ggplot2)
library(mgcv)
library(gnm)
library(GGally)
library(gmodels)
library(Hmisc)
library(caret)
library(ROCR)
library(gridExtra)
library(corrplot)
library(kableExtra)
library(faraway)
library(ggpubr)
library(klaR)
setwd("C:/Users/ehm24/Documents/master/TFM")
datos.raw <- read.csv("./datos/MontiFinal.csv")
# eliminamos las preguntas de los cuestionarios, categorizaciones de variables y
# otras variables no relevantes para el estudio
datos <- datos.raw[,-c(1,2,15,22,26:84,113:131,137:142,144:145)]
# definimos los factores
# Patient identifier
datos$id.code <- as.factor(datos$id.code)
# Hospital identifier
datos$hospid <- factor(datos$hospid,levels=c(1,2,3,4),
labels=c("phupaman","srinakarin","wegaronrat","chula"))
# Sex
datos$gender <- factor(datos$gender,levels=c(1,2),
labels=c("Hombre","Mujer"))
# Marital status
datos$mstatus <- factor(datos$mstatus,levels=c(1,2,3,4,5),
labels=c("Soltero","´Casado",
"Viudo","Divorciado","Separado"))
# Education level
datos$edu <- factor(datos$edu,
levels=c(1,2,3,4,5,6),
labels=c("Sin estudios" , "Primarios", "Secundarios",
"Grado", "Master", "Grado Superior"))
# Religion
datos$religion <- factor(datos$religion,levels=c(1,2,3,4),
labels=c("Budismo", "Islam", "Cristianismo","Otra religión"))
# Income
datos$income <- factor(datos$income, levels=c(1,2,3,4,5,6),
labels=c("menos de 4,999 bath", "5,000-9,999 bath",
"10,000-14,999 bath", "15,000-19,999 bath",
"20,000-24,999 bath", "más de 25,000 bath"))
# Family history of T2DM
datos$famhx <- factor(datos$famhx,levels=c(0,1),labels=c("No", "Sí"))
# Comorbid disease
datos$comob <- factor(datos$comob,levels=c(0,1),labels=c("No", "Sí"))
# Comorbidity lipidemia
datos$comlip <- factor(datos$comlip,levels=c(0,1),labels=c("No", "Sí"))
# Comorbidity hypertension
datos$comht <- factor(datos$comht,levels=c(0,1),labels=c("No", "Sí"))
# Comorbidity coronary heart disease
datos$comchd <- factor(datos$comchd,levels=c(0,1),labels=c("No", "Sí"))
# Comorbidity kidney disease
datos$comkid <- factor(datos$comkid,levels=c(0,1),labels=c("No", "Sí"))
# Other comorbidity
datos$comoth <- factor(datos$comoth,levels=c(0,1),labels=c("No", "Sí"))
# Type of Treatment of DM
datos$dmrx <- factor(datos$dmrx,
levels=c(1,2,3,4),
labels=c("Sin medicación","Hipoglicémico oral",
"Insulina","Ambos"))
# Smoking
datos$smk <- factor(datos$smk,levels=c(1,2,3),
labels=c("Sí","Exfumador","No"))
# Alcohol
datos$alcohol <- factor(datos$alcohol,levels=c(1,2,3),
labels=c("Sí","Exbebedor","No"))
# Diabetes Complication
datos$compli <- factor(datos$compli,levels=c(0,1),labels=c("No", "Sí"))
# Cerebrovascular Accident
datos$cva <- factor(datos$cva,levels=c(0,1),labels=c("No", "Sí"))
# Cerebral Infraction
datos$cereinfrac <- factor(datos$cereinfrac,levels=c(0,1),labels=c("No", "Sí"))
# Ischemic Stroke
datos$ishemic <- factor(datos$ishemic,levels=c(0,1),labels=c("No", "Sí"))
# Stroke, Not specify
datos$stroke <- factor(datos$stroke,levels=c(0,1),labels=c("No", "Sí"))
# Cerebral Hemorrhage
datos$cerebhem <- factor(datos$cerebhem,levels=c(0,1),labels=c("No", "Sí"))
# Transient Ischemic Attack
datos$tia <- factor(datos$tia,levels=c(0,1),labels=c("No", "Sí"))
# Angina pectoris
datos$angia <- factor(datos$angia,levels=c(0,1),labels=c("No", "Sí"))
# Congestive Heart Failure
datos$chf <- factor(datos$chf,levels=c(0,1),labels=c("No", "Sí"))
# Myocardial Infraction
datos$mi <- factor(datos$mi,levels=c(0,1),labels=c("No", "Sí"))
# Coronary Revascularization
datos$cororevas <- factor(datos$cororevas,levels=c(0,1),labels=c("No", "Sí"))
# Peripheral Arterial Disease
datos$pad <- factor(datos$pad,levels=c(0,1),labels=c("No", "Sí"))
# Neuropathy
datos$neuropath <- factor(datos$neuropath,levels=c(0,1),labels=c("No", "Sí"))
# Renal Insufficiency
datos$renal <- factor(datos$renal,levels=c(0,1),labels=c("No", "Sí"))
# Diabetes Nephropathy
datos$dn <- factor(datos$dn,levels=c(0,1),labels=c("No", "Sí"))
# Diabetes Retinopathy
datos$dr <- factor(datos$dr,levels=c(0,1),labels=c("No", "Sí"))
# Other complication
datos$othcomp <- factor(datos$othcomp,levels=c(0,1),labels=c("No", "Sí"))
# creación de variables derivadas
# alerta: 1-Sí si hba1c > 7% (diabetes no controlada)
datos$alerta <- factor(ifelse(datos$hba1c > 7,1,0),
levels=c(0,1),
labels=c("No", "Sí"))
# BMI
datos$bmi <- datos$weight/(datos$height/100)**2
# sobrepeso, BMI >25
datos$sobrepeso <- factor(ifelse(datos$bmi > 25,1,0),
levels=c(0,1),labels=c("No", "Sí"))
# obesidad, BMI > 30
datos$obesidad <- factor(ifelse(datos$bmi > 30,1,0),
levels=c(0,1),labels=c("No", "Sí"))
# transformación a variable tipo date de todas las variables fecha
# estas variables son útiles para poder determinar que datos tienen
# información actualizada
datos$fecha.hba1c <- as.Date(datos$hba1cdate,"%d/%m/%y")
datos$fecha.ldl <- as.Date(datos$ldldate,"%d/%m/%y")
datos$fecha.hdl <- as.Date(datos$hdldate,"%d/%m/%y")
datos$fecha.trig <- as.Date(datos$trigdate,"%d/%m/%y")
datos$fecha.bp <- as.Date(datos$bpdate,"%d/%m/%y")
# eliminación de variables no necesarias
drop.cols <- c("weight","height","hba1cdate","ldldate","hdldate","trigdate","bpdate")
datos <- datos %>% select(-all_of(drop.cols))
Se tiene sólo datos faltantes en dos variables tipo fecha (fecha.ldl y fecha.bp) y en sólo dos pacientes.
# datos faltantes
datos[!complete.cases(datos),]
## id.code hospid gender mstatus age edu religion
## 225 225 wegaronrat Hombre ´Casado 56 Secundarios Islam
## 352 352 wegaronrat Hombre Viudo 70 Primarios Budismo
## income dmdura famhx comob comlip comht comchd comkid comoth
## 225 menos de 4,999 bath 4 No Sí Sí Sí No No No
## 352 menos de 4,999 bath 5 No Sí Sí Sí No No No
## dmrx smk alcohol hba1c ldl hdl trig sbp dbp compli
## 225 Hipoglicémico oral No No 6.0 99 68 111 137 95 No
## 352 Ambos Exfumador Exbebedor 6.7 76 46 72 133 22 Sí
## cva cereinfrac ishemic stroke cerebhem tia angia chf mi cororevas pad
## 225 No No No No No No No No No No No
## 352 No No Sí No No No No No No No No
## neuropath renal dn dr othcomp DMSES.Diet DMSES.Monitor DMSES.Physical
## 225 No No No No No 0.3336945 0.1225384 -0.6551139
## 352 No No No No No 0.1123347 0.3966739 0.3133142
## DMSES.Regimen DMSES.Total DK.10 alerta bmi sobrepeso obesidad
## 225 -0.5854147 -0.7842956 4.685086 No 33.62209 Sí Sí
## 352 0.1183689 0.9406917 4.550101 No 22.65625 No No
## fecha.hba1c fecha.ldl fecha.hdl fecha.trig fecha.bp
## 225 2016-02-15 <NA> 2015-06-18 2015-06-18 2016-05-16
## 352 2016-01-15 2016-05-13 2016-05-23 2016-05-23 <NA>
Las variables continuas son:
age
dmdura
hba1c
ldl
hdl
trig
sbp
dbp
DMSES.Diet
DMSES.Monitor
DMSES.Physical
DMSES.Regimen
DMSES.Total
DK.10
flag_outliers <- function(x, na.rm = TRUE, ...) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x)
y <- x
y[x < (qnt[1] - H)] <- NA
y[x > (qnt[2] + H)] <- NA
z <-as.factor(as.integer(is.na(y)))
return(z)
}
datos$age_out <- flag_outliers(datos$age)
datos$dmdura_out <- flag_outliers(datos$dmdura)
datos$hba1c_out <- flag_outliers(datos$hba1c)
datos$ldl_out <- flag_outliers(datos$ldl)
datos$hdl_out <- flag_outliers(datos$hdl)
datos$trig_out <- flag_outliers(datos$trig)
datos$sbp_out <- flag_outliers(datos$sbp)
datos$dbp_out <- flag_outliers(datos$dbp)
datos$DMSES.Diet_out <- flag_outliers(datos$DMSES.Diet)
datos$DMSES.Monitor_out <- flag_outliers(datos$DMSES.Monitor)
datos$DMSES.Physical_out <- flag_outliers(datos$DMSES.Physical)
datos$DMSES.Regimen_out <- flag_outliers(datos$DMSES.Regimen)
datos$DMSES.Total_out <- flag_outliers(datos$DMSES.Total)
datos$DK.10_out <- flag_outliers(datos$DK.10)
datos$bmi_out <- flag_outliers(datos$bmi)
summary(datos)
## id.code hospid gender mstatus age
## 1 : 1 phupaman : 60 Hombre:208 Soltero : 57 Min. :26.00
## 2 : 1 srinakarin: 77 Mujer :492 ´Casado :462 1st Qu.:59.00
## 3 : 1 wegaronrat:241 Viudo :165 Median :65.00
## 4 : 1 chula :322 Divorciado: 13 Mean :65.16
## 5 : 1 Separado : 3 3rd Qu.:73.00
## 6 : 1 Max. :95.00
## (Other):694
## edu religion income
## Sin estudios : 47 Budismo :543 menos de 4,999 bath:318
## Primarios :381 Islam :152 5,000-9,999 bath : 95
## Secundarios :146 Cristianismo : 5 10,000-14,999 bath : 86
## Grado : 99 Otra religión: 0 15,000-19,999 bath : 48
## Master : 25 20,000-24,999 bath : 48
## Grado Superior: 2 más de 25,000 bath :105
##
## dmdura famhx comob comlip comht comchd comkid comoth
## Min. : 4.00 No:370 No: 42 No: 95 No: 97 No:662 No:512 No:699
## 1st Qu.: 7.00 Sí:330 Sí:658 Sí:605 Sí:603 Sí: 38 Sí:188 Sí: 1
## Median :10.00
## Mean :13.53
## 3rd Qu.:20.00
## Max. :45.00
##
## dmrx smk alcohol hba1c
## Sin medicación : 12 Sí : 23 Sí : 42 Min. : 2.000
## Hipoglicémico oral:409 Exfumador: 88 Exbebedor: 89 1st Qu.: 6.400
## Insulina : 94 No :589 No :569 Median : 7.100
## Ambos :185 Mean : 7.579
## 3rd Qu.: 8.300
## Max. :15.000
##
## ldl hdl trig sbp
## Min. : 8.7 Min. : 17.00 Min. : 29.30 Min. : 93.0
## 1st Qu.: 81.0 1st Qu.: 41.88 1st Qu.: 95.75 1st Qu.:123.0
## Median : 97.0 Median : 50.00 Median :127.00 Median :133.0
## Mean :100.8 Mean : 50.75 Mean :149.02 Mean :134.8
## 3rd Qu.:116.0 3rd Qu.: 58.00 3rd Qu.:175.00 3rd Qu.:145.0
## Max. :221.0 Max. :116.00 Max. :999.00 Max. :217.0
##
## dbp compli cva cereinfrac ishemic stroke cerebhem
## Min. : 22.00 No:429 No:699 No:700 No:688 No:698 No:700
## 1st Qu.: 65.00 Sí:271 Sí: 1 Sí: 0 Sí: 12 Sí: 2 Sí: 0
## Median : 73.00
## Mean : 73.02
## 3rd Qu.: 80.00
## Max. :112.00
##
## tia angia chf mi cororevas pad neuropath renal
## No:699 No:700 No:692 No:675 No:699 No:696 No:695 No:582
## Sí: 1 Sí: 0 Sí: 8 Sí: 25 Sí: 1 Sí: 4 Sí: 5 Sí:118
##
##
##
##
##
## dn dr othcomp DMSES.Diet DMSES.Monitor
## No:619 No:561 No:699 Min. :-1.45946 Min. :-0.84225
## Sí: 81 Sí:139 Sí: 1 1st Qu.:-0.53693 1st Qu.:-0.23065
## Median :-0.06537 Median :-0.03718
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.52029 3rd Qu.: 0.24502
## Max. : 1.03434 Max. : 0.55056
##
## DMSES.Physical DMSES.Regimen DMSES.Total DK.10
## Min. :-1.28709 Min. :-2.86027 Min. :-5.6289 Min. : 0.000
## 1st Qu.:-0.28906 1st Qu.: 0.07856 1st Qu.:-1.0268 1st Qu.: 4.224
## Median : 0.01757 Median : 0.09450 Median :-0.0258 Median : 5.486
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 5.535
## 3rd Qu.: 0.29446 3rd Qu.: 0.10978 3rd Qu.: 1.0416 3rd Qu.: 6.919
## Max. : 0.79336 Max. : 0.51344 Max. : 2.5059 Max. :10.000
##
## alerta bmi sobrepeso obesidad fecha.hba1c
## No:333 Min. :14.31 No:301 No:518 Min. :2006-02-01
## Sí:367 1st Qu.:22.91 Sí:399 Sí:182 1st Qu.:2016-03-01
## Median :25.96 Median :2016-05-04
## Mean :27.08 Mean :2016-03-07
## 3rd Qu.:30.14 3rd Qu.:2016-06-07
## Max. :76.03 Max. :2016-07-04
##
## fecha.ldl fecha.hdl fecha.trig
## Min. :2012-11-08 Min. :2006-04-23 Min. :2012-11-08
## 1st Qu.:2016-02-04 1st Qu.:2016-01-13 1st Qu.:2016-02-02
## Median :2016-04-21 Median :2016-03-29 Median :2016-04-21
## Mean :2016-02-19 Mean :2016-01-26 Mean :2016-02-27
## 3rd Qu.:2016-05-31 3rd Qu.:2016-05-31 3rd Qu.:2016-05-31
## Max. :2016-09-03 Max. :2016-12-09 Max. :2016-12-21
## NA's :1
## fecha.bp age_out dmdura_out hba1c_out ldl_out hdl_out trig_out
## Min. :2006-04-27 0:692 0:692 0:666 0:674 0:683 0:665
## 1st Qu.:2016-04-22 1: 8 1: 8 1: 34 1: 26 1: 17 1: 35
## Median :2016-05-19
## Mean :2016-05-14
## 3rd Qu.:2016-06-16
## Max. :2035-03-27
## NA's :1
## sbp_out dbp_out DMSES.Diet_out DMSES.Monitor_out DMSES.Physical_out
## 0:684 0:687 0:700 0:700 0:696
## 1: 16 1: 13 1: 4
##
##
##
##
##
## DMSES.Regimen_out DMSES.Total_out DK.10_out bmi_out
## 0:578 0:698 0:694 0:686
## 1:122 1: 2 1: 6 1: 14
##
##
##
##
##
# flags para outliers
vars_out <- c("age_out","dmdura_out","hba1c_out","ldl_out","hdl_out","trig_out","sbp_out","dbp_out",
"DMSES.Diet_out","DMSES.Monitor_out","DMSES.Physical_out","DMSES.Regimen_out",
"DMSES.Total_out","DK.10_out")
# la variable datos$out valdrá uno si alguna de las variables continuas es outlier, 0 en caso contrario
datos$out <- apply(datos[,vars_out],1,max)
# variables continuas
vars_cont <- c("age","dmdura","hba1c","ldl","hdl","trig","sbp","dbp",
"DMSES.Diet","DMSES.Monitor","DMSES.Physical","DMSES.Regimen",
"DMSES.Total","DK.10")
# variables categóricas
vars_cat <- c("id.code","hospid","gender","mstatus","edu","religion","income",
"famhx","comob","comlip","comht","comchd","comkid","comoth","dmrx",
"smk","alcohol","compli","cva","cereinfrac","ishemic","stroke","cerebhem",
"tia","angia","chf","mi","cororevas","pad","neuropath","renal","dn",
"dr","othcomp","alerta","sobrepeso","obesidad",vars_out,"out")
# otras variables
vars_otras <- c("fecha.hba1c","fecha.ldl","fecha.hdl","fecha.trig","fecha.bp")
# todas las variables
vars <- c(vars_cat,vars_cont,vars_otras)
# pacientes con alguna variable continua outlier
pat_out <- datos %>% filter(out==1)
dim(pat_out)
## [1] 234 73
dim(datos)
## [1] 700 73
attach(datos)
La estructura del fichero inicial, ya con todas las variables derivadas creadas es la que sigue:
# descripcion del fichero con el que se trabajará
str(datos)
## 'data.frame': 700 obs. of 73 variables:
## $ id.code : Factor w/ 700 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ hospid : Factor w/ 4 levels "phupaman","srinakarin",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ gender : Factor w/ 2 levels "Hombre","Mujer": 1 2 1 2 1 2 1 2 2 2 ...
## $ mstatus : Factor w/ 5 levels "Soltero","´Casado",..: 2 2 2 3 3 2 2 2 2 2 ...
## $ age : int 51 55 47 54 71 55 59 64 58 69 ...
## $ edu : Factor w/ 6 levels "Sin estudios",..: 4 2 2 2 2 2 2 2 2 1 ...
## $ religion : Factor w/ 4 levels "Budismo","Islam",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ income : Factor w/ 6 levels "menos de 4,999 bath",..: 5 2 1 1 1 1 6 1 1 1 ...
## $ dmdura : int 4 4 5 20 10 4 4 22 4 14 ...
## $ famhx : Factor w/ 2 levels "No","Sí": 2 2 1 1 1 1 1 1 2 1 ...
## $ comob : Factor w/ 2 levels "No","Sí": 2 2 2 2 2 2 2 2 1 2 ...
## $ comlip : Factor w/ 2 levels "No","Sí": 2 2 2 2 1 2 2 2 1 2 ...
## $ comht : Factor w/ 2 levels "No","Sí": 2 1 2 2 2 1 2 2 1 2 ...
## $ comchd : Factor w/ 2 levels "No","Sí": 1 1 1 1 2 1 1 1 1 1 ...
## $ comkid : Factor w/ 2 levels "No","Sí": 1 1 2 1 2 1 1 1 1 1 ...
## $ comoth : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ dmrx : Factor w/ 4 levels "Sin medicación",..: 2 2 2 4 3 2 2 2 2 4 ...
## $ smk : Factor w/ 3 levels "Sí","Exfumador",..: 2 3 2 3 2 3 3 3 3 3 ...
## $ alcohol : Factor w/ 3 levels "Sí","Exbebedor",..: 2 3 2 3 1 3 2 3 3 3 ...
## $ hba1c : num 9.7 7.1 8 7.2 8.8 8 9.8 6.3 6.7 7.7 ...
## $ ldl : num 123.7 89.8 119 202.1 58.6 ...
## $ hdl : num 45.2 49.5 50.7 47 35.7 35.3 42 44.9 46 35.6 ...
## $ trig : num 248.9 276.9 165.6 84.7 149.5 ...
## $ sbp : int 130 120 168 121 118 119 119 150 124 169 ...
## $ dbp : int 80 70 94 56 65 63 83 76 76 93 ...
## $ compli : Factor w/ 2 levels "No","Sí": 1 1 2 1 2 1 1 1 1 1 ...
## $ cva : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ cereinfrac : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ ishemic : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ stroke : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ cerebhem : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ tia : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ angia : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ chf : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ mi : Factor w/ 2 levels "No","Sí": 1 1 1 1 2 1 1 1 1 1 ...
## $ cororevas : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ pad : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ neuropath : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ renal : Factor w/ 2 levels "No","Sí": 1 1 2 1 2 1 1 1 1 1 ...
## $ dn : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ dr : Factor w/ 2 levels "No","Sí": 1 1 2 1 1 1 1 1 1 1 ...
## $ othcomp : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ DMSES.Diet : num -0.132233 -0.000414 1.031174 -1.102578 -1.064826 ...
## $ DMSES.Monitor : num -0.133 0.158 0.527 -0.168 -0.37 ...
## $ DMSES.Physical : num -0.1775 0.09218 0.79015 -0.00571 0.02158 ...
## $ DMSES.Regimen : num 0.0909 0.106 -0.0232 0.0916 0.0824 ...
## $ DMSES.Total : num -0.352 0.355 2.325 -1.184 -1.331 ...
## $ DK.10 : num 5.77 6.66 4.27 5.19 5.65 ...
## $ alerta : Factor w/ 2 levels "No","Sí": 2 2 2 2 2 2 2 1 1 2 ...
## $ bmi : num 23 25.6 22.6 25.7 21.1 ...
## $ sobrepeso : Factor w/ 2 levels "No","Sí": 1 2 1 2 1 1 2 1 2 1 ...
## $ obesidad : Factor w/ 2 levels "No","Sí": 1 1 1 1 1 1 1 1 1 1 ...
## $ fecha.hba1c : Date, format: "2014-07-10" "2014-07-26" ...
## $ fecha.ldl : Date, format: "2015-11-06" "2015-02-16" ...
## $ fecha.hdl : Date, format: "2015-11-06" "2015-02-16" ...
## $ fecha.trig : Date, format: "2015-11-06" "2015-02-16" ...
## $ fecha.bp : Date, format: "2016-02-19" "2016-02-24" ...
## $ age_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ dmdura_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ hba1c_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ldl_out : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
## $ hdl_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ trig_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ sbp_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ dbp_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DMSES.Diet_out : Factor w/ 1 level "0": 1 1 1 1 1 1 1 1 1 1 ...
## $ DMSES.Monitor_out : Factor w/ 1 level "0": 1 1 1 1 1 1 1 1 1 1 ...
## $ DMSES.Physical_out: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DMSES.Regimen_out : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 2 1 1 1 ...
## $ DMSES.Total_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DK.10_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ bmi_out : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ out : chr "0" "0" "1" "1" ...
Una primera vista a las variables de nuestro fichero:
summary(datos[,-c(1)])
## hospid gender mstatus age
## phupaman : 60 Hombre:208 Soltero : 57 Min. :26.00
## srinakarin: 77 Mujer :492 ´Casado :462 1st Qu.:59.00
## wegaronrat:241 Viudo :165 Median :65.00
## chula :322 Divorciado: 13 Mean :65.16
## Separado : 3 3rd Qu.:73.00
## Max. :95.00
##
## edu religion income
## Sin estudios : 47 Budismo :543 menos de 4,999 bath:318
## Primarios :381 Islam :152 5,000-9,999 bath : 95
## Secundarios :146 Cristianismo : 5 10,000-14,999 bath : 86
## Grado : 99 Otra religión: 0 15,000-19,999 bath : 48
## Master : 25 20,000-24,999 bath : 48
## Grado Superior: 2 más de 25,000 bath :105
##
## dmdura famhx comob comlip comht comchd comkid comoth
## Min. : 4.00 No:370 No: 42 No: 95 No: 97 No:662 No:512 No:699
## 1st Qu.: 7.00 Sí:330 Sí:658 Sí:605 Sí:603 Sí: 38 Sí:188 Sí: 1
## Median :10.00
## Mean :13.53
## 3rd Qu.:20.00
## Max. :45.00
##
## dmrx smk alcohol hba1c
## Sin medicación : 12 Sí : 23 Sí : 42 Min. : 2.000
## Hipoglicémico oral:409 Exfumador: 88 Exbebedor: 89 1st Qu.: 6.400
## Insulina : 94 No :589 No :569 Median : 7.100
## Ambos :185 Mean : 7.579
## 3rd Qu.: 8.300
## Max. :15.000
##
## ldl hdl trig sbp
## Min. : 8.7 Min. : 17.00 Min. : 29.30 Min. : 93.0
## 1st Qu.: 81.0 1st Qu.: 41.88 1st Qu.: 95.75 1st Qu.:123.0
## Median : 97.0 Median : 50.00 Median :127.00 Median :133.0
## Mean :100.8 Mean : 50.75 Mean :149.02 Mean :134.8
## 3rd Qu.:116.0 3rd Qu.: 58.00 3rd Qu.:175.00 3rd Qu.:145.0
## Max. :221.0 Max. :116.00 Max. :999.00 Max. :217.0
##
## dbp compli cva cereinfrac ishemic stroke cerebhem
## Min. : 22.00 No:429 No:699 No:700 No:688 No:698 No:700
## 1st Qu.: 65.00 Sí:271 Sí: 1 Sí: 0 Sí: 12 Sí: 2 Sí: 0
## Median : 73.00
## Mean : 73.02
## 3rd Qu.: 80.00
## Max. :112.00
##
## tia angia chf mi cororevas pad neuropath renal
## No:699 No:700 No:692 No:675 No:699 No:696 No:695 No:582
## Sí: 1 Sí: 0 Sí: 8 Sí: 25 Sí: 1 Sí: 4 Sí: 5 Sí:118
##
##
##
##
##
## dn dr othcomp DMSES.Diet DMSES.Monitor
## No:619 No:561 No:699 Min. :-1.45946 Min. :-0.84225
## Sí: 81 Sí:139 Sí: 1 1st Qu.:-0.53693 1st Qu.:-0.23065
## Median :-0.06537 Median :-0.03718
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.52029 3rd Qu.: 0.24502
## Max. : 1.03434 Max. : 0.55056
##
## DMSES.Physical DMSES.Regimen DMSES.Total DK.10
## Min. :-1.28709 Min. :-2.86027 Min. :-5.6289 Min. : 0.000
## 1st Qu.:-0.28906 1st Qu.: 0.07856 1st Qu.:-1.0268 1st Qu.: 4.224
## Median : 0.01757 Median : 0.09450 Median :-0.0258 Median : 5.486
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 5.535
## 3rd Qu.: 0.29446 3rd Qu.: 0.10978 3rd Qu.: 1.0416 3rd Qu.: 6.919
## Max. : 0.79336 Max. : 0.51344 Max. : 2.5059 Max. :10.000
##
## alerta bmi sobrepeso obesidad fecha.hba1c
## No:333 Min. :14.31 No:301 No:518 Min. :2006-02-01
## Sí:367 1st Qu.:22.91 Sí:399 Sí:182 1st Qu.:2016-03-01
## Median :25.96 Median :2016-05-04
## Mean :27.08 Mean :2016-03-07
## 3rd Qu.:30.14 3rd Qu.:2016-06-07
## Max. :76.03 Max. :2016-07-04
##
## fecha.ldl fecha.hdl fecha.trig
## Min. :2012-11-08 Min. :2006-04-23 Min. :2012-11-08
## 1st Qu.:2016-02-04 1st Qu.:2016-01-13 1st Qu.:2016-02-02
## Median :2016-04-21 Median :2016-03-29 Median :2016-04-21
## Mean :2016-02-19 Mean :2016-01-26 Mean :2016-02-27
## 3rd Qu.:2016-05-31 3rd Qu.:2016-05-31 3rd Qu.:2016-05-31
## Max. :2016-09-03 Max. :2016-12-09 Max. :2016-12-21
## NA's :1
## fecha.bp age_out dmdura_out hba1c_out ldl_out hdl_out trig_out
## Min. :2006-04-27 0:692 0:692 0:666 0:674 0:683 0:665
## 1st Qu.:2016-04-22 1: 8 1: 8 1: 34 1: 26 1: 17 1: 35
## Median :2016-05-19
## Mean :2016-05-14
## 3rd Qu.:2016-06-16
## Max. :2035-03-27
## NA's :1
## sbp_out dbp_out DMSES.Diet_out DMSES.Monitor_out DMSES.Physical_out
## 0:684 0:687 0:700 0:700 0:696
## 1: 16 1: 13 1: 4
##
##
##
##
##
## DMSES.Regimen_out DMSES.Total_out DK.10_out bmi_out out
## 0:578 0:698 0:694 0:686 Length:700
## 1:122 1: 2 1: 6 1: 14 Class :character
## Mode :character
##
##
##
##
Los únicos valores faltantes se dan en las variables fecha.ldl y fecha.bp, un único caso para cada variable. De momento se decide no eliminar los registros.
Analizamos con un poco más de detalle las variables categóricas
Analizamos con un poco más de detalle las variables continuas.
bx_age <- ggplot(datos,aes(y=age)) + geom_boxplot()
bx_dmdura <- ggplot(datos,aes(y=dmdura)) + geom_boxplot()
bx_hba1c <- ggplot(datos,aes(y=hba1c)) + geom_boxplot()
bx_ldl <- ggplot(datos,aes(y=ldl)) + geom_boxplot()
bx_hdl <- ggplot(datos,aes(y=hdl)) + geom_boxplot()
bx_trig <- ggplot(datos,aes(y=trig)) + geom_boxplot()
bx_sbp <- ggplot(datos,aes(y=sbp)) + geom_boxplot()
bx_dbp <- ggplot(datos,aes(y=dbp)) + geom_boxplot()
bx_DMSES.Diet <- ggplot(datos,aes(y=DMSES.Diet)) + geom_boxplot()
bx_DMSES.Monitor <- ggplot(datos,aes(y=DMSES.Monitor)) + geom_boxplot()
bx_DMSES.Physical <- ggplot(datos,aes(y=DMSES.Physical)) + geom_boxplot()
bx_DMSES.Regimen <- ggplot(datos,aes(y=DMSES.Regimen)) + geom_boxplot()
bx_DMSES.Total <- ggplot(datos,aes(y=DMSES.Total)) + geom_boxplot()
bx_DK.10 <- ggplot(datos,aes(y=DK.10)) + geom_boxplot()
bx_bmi <- ggplot(datos,aes(y=bmi)) + geom_boxplot()
grid.arrange(bx_age,bx_dmdura,bx_hba1c,bx_ldl,bx_hdl,bx_trig,bx_sbp,bx_dbp,
bx_DMSES.Diet, bx_DMSES.Monitor, bx_DMSES.Physical, bx_DMSES.Regimen,
bx_DMSES.Total, bx_DK.10, bx_bmi,ncol=4)
den_age <- ggplot(datos,aes(x=age)) + geom_density()
den_dmdura <- ggplot(datos,aes(x=dmdura)) + geom_density()
den_hba1c <- ggplot(datos,aes(x=hba1c)) + geom_density()
den_ldl <- ggplot(datos,aes(x=ldl)) + geom_density()
den_hdl <- ggplot(datos,aes(x=hdl)) + geom_density()
den_trig <- ggplot(datos,aes(x=trig)) + geom_density()
den_sbp <- ggplot(datos,aes(x=sbp)) + geom_density()
den_dbp <- ggplot(datos,aes(x=dbp)) + geom_density()
den_DMSES.Diet <- ggplot(datos,aes(x=DMSES.Diet)) + geom_density()
den_DMSES.Monitor <- ggplot(datos,aes(x=DMSES.Monitor)) + geom_density()
den_DMSES.Physical <- ggplot(datos,aes(x=DMSES.Physical)) + geom_density()
den_DMSES.Regimen <- ggplot(datos,aes(x=DMSES.Regimen)) + geom_density()
den_DMSES.Total <- ggplot(datos,aes(x=DMSES.Total)) + geom_density()
den_DK.10 <- ggplot(datos,aes(x=DK.10)) + geom_density()
den_bmi <- ggplot(datos,aes(x=bmi)) + geom_density()
grid.arrange(den_age,den_dmdura,den_hba1c,den_ldl,den_hdl,den_trig,den_sbp,den_dbp,
den_DMSES.Diet, den_DMSES.Monitor, den_DMSES.Physical, den_DMSES.Regimen,
den_DMSES.Total, den_DK.10, den_bmi,ncol=4)
Ningun predictor sigue una distribución normal según el test de normalidad de Shapiro y según el QQ plot tenemos normalidad en eda, hdl, sbp, dbp, DMSES.Total y DK.10.
shapiro.test(datos$age)
##
## Shapiro-Wilk normality test
##
## data: datos$age
## W = 0.99459, p-value = 0.01385
shapiro.test(datos$dmdura)
##
## Shapiro-Wilk normality test
##
## data: datos$dmdura
## W = 0.90381, p-value < 2.2e-16
shapiro.test(datos$hba1c)
##
## Shapiro-Wilk normality test
##
## data: datos$hba1c
## W = 0.88872, p-value < 2.2e-16
shapiro.test(datos$ldl)
##
## Shapiro-Wilk normality test
##
## data: datos$ldl
## W = 0.95629, p-value = 1.435e-13
shapiro.test(datos$hdl)
##
## Shapiro-Wilk normality test
##
## data: datos$hdl
## W = 0.97306, p-value = 4.687e-10
shapiro.test(datos$trig)
##
## Shapiro-Wilk normality test
##
## data: datos$trig
## W = 0.73709, p-value < 2.2e-16
shapiro.test(datos$sbp)
##
## Shapiro-Wilk normality test
##
## data: datos$sbp
## W = 0.97534, p-value = 1.774e-09
shapiro.test(datos$dbp)
##
## Shapiro-Wilk normality test
##
## data: datos$dbp
## W = 0.99086, p-value = 0.0002488
shapiro.test(datos$DMSES.Diet)
##
## Shapiro-Wilk normality test
##
## data: datos$DMSES.Diet
## W = 0.94977, p-value = 1.089e-14
shapiro.test(datos$DMSES.Monitor)
##
## Shapiro-Wilk normality test
##
## data: datos$DMSES.Monitor
## W = 0.97283, p-value = 4.121e-10
shapiro.test(datos$DMSES.Physical)
##
## Shapiro-Wilk normality test
##
## data: datos$DMSES.Physical
## W = 0.9796, p-value = 2.638e-08
shapiro.test(datos$DMSES.Regimen)
##
## Shapiro-Wilk normality test
##
## data: datos$DMSES.Regimen
## W = 0.41931, p-value < 2.2e-16
shapiro.test(datos$DMSES.Total)
##
## Shapiro-Wilk normality test
##
## data: datos$DMSES.Total
## W = 0.98158, p-value = 1.038e-07
shapiro.test(datos$DK.10)
##
## Shapiro-Wilk normality test
##
## data: datos$DK.10
## W = 0.99402, p-value = 0.007196
shapiro.test(datos$bmi)
##
## Shapiro-Wilk normality test
##
## data: datos$bmi
## W = 0.88978, p-value < 2.2e-16
#ggqqplot(datos[,vars_cont],combine = TRUE,facet.by=c(5,3))
qq_age <- ggqqplot(datos$age)
qq_dmdura <- ggqqplot(datos$dmdura)
qq_hba1c <- ggqqplot(datos$hba1c)
qq_ldl <- ggqqplot(datos$ldl)
qq_hdl <- ggqqplot(datos$hdl)
qq_trig <- ggqqplot(datos$trig)
qq_sbp <- ggqqplot(datos$sbp)
qq_dbp <- ggqqplot(datos$dbp)
qq_DMSES.Diet <- ggqqplot(datos$DMSES.Diet)
qq_DMSES.Monitor <- ggqqplot(datos$DMSES.Monitor)
qq_DMSES.Physical <- ggqqplot(datos$DMSES.Physical)
qq_DMSES.Regimen <- ggqqplot(datos$DMSES.Regimen)
qq_DMSES.Total <- ggqqplot(datos$DMSES.Total)
qq_DK.10 <- ggqqplot(datos$DK.10)
qq_bmi <- ggqqplot(datos$bmi)
#Arrange all the plots onto one page
#plot_grid(qq_age, qq_dmdura, qq_hba1c, qq_ldl, qq_hdl, qq_trig,
# qq_sbp, qq_dbp, qq_DMSES.Diet, qq_DMSES.Monitor,
# qq_DMSES.Physical, qq_DMSES.Regimen, qq_DMSES.Total,
# qq_DK.10, qq_bmi, nrow=5,ncol=3)
#grid.arrange(qq_age, qq_dmdura, qq_hba1c, qq_ldl, qq_hdl, qq_trig,
# qq_sbp, qq_dbp, qq_DMSES.Diet, qq_DMSES.Monitor,
# qq_DMSES.Physical, qq_DMSES.Regimen, qq_DMSES.Total,
# qq_DK.10, qq_bmi, ncol=2)
grid.arrange(qq_age, qq_dmdura, qq_hba1c, qq_ldl, qq_hdl, qq_trig, ncol=3)
grid.arrange(qq_sbp, qq_dbp, qq_DMSES.Diet, qq_DMSES.Monitor,
qq_DMSES.Physical, qq_DMSES.Regimen, ncol = 3)
grid.arrange(qq_DMSES.Total,qq_DK.10, qq_bmi, ncol=3)
Los niveles de correlación entre las puntuaciones DMSES y el resto de variables continuas son bajos a excepción de la variable hba1c. La mayor correlación entre hba1c y las puntuaciones DMSES se da con el factor dieta. Se detecta que las diferentes puntuaciones del cuestionario DMSES estan fuertemente relacionas, por lo que se puede considerar tomar sólo una para el análisis. La puntuación total y la de la parte de la dieta son las mejoras candidatas debido a que tienen una correlación más alta con hba1c.
dim(datos[,vars_cont])
## [1] 700 14
str(datos[,vars_cont])
## 'data.frame': 700 obs. of 14 variables:
## $ age : int 51 55 47 54 71 55 59 64 58 69 ...
## $ dmdura : int 4 4 5 20 10 4 4 22 4 14 ...
## $ hba1c : num 9.7 7.1 8 7.2 8.8 8 9.8 6.3 6.7 7.7 ...
## $ ldl : num 123.7 89.8 119 202.1 58.6 ...
## $ hdl : num 45.2 49.5 50.7 47 35.7 35.3 42 44.9 46 35.6 ...
## $ trig : num 248.9 276.9 165.6 84.7 149.5 ...
## $ sbp : int 130 120 168 121 118 119 119 150 124 169 ...
## $ dbp : int 80 70 94 56 65 63 83 76 76 93 ...
## $ DMSES.Diet : num -0.132233 -0.000414 1.031174 -1.102578 -1.064826 ...
## $ DMSES.Monitor : num -0.133 0.158 0.527 -0.168 -0.37 ...
## $ DMSES.Physical: num -0.1775 0.09218 0.79015 -0.00571 0.02158 ...
## $ DMSES.Regimen : num 0.0909 0.106 -0.0232 0.0916 0.0824 ...
## $ DMSES.Total : num -0.352 0.355 2.325 -1.184 -1.331 ...
## $ DK.10 : num 5.77 6.66 4.27 5.19 5.65 ...
M <- cor(datos[,c(vars_cont)])
knitr::kable(round(M,2))
| age | dmdura | hba1c | ldl | hdl | trig | sbp | dbp | DMSES.Diet | DMSES.Monitor | DMSES.Physical | DMSES.Regimen | DMSES.Total | DK.10 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 1.00 | 0.44 | -0.24 | -0.13 | -0.06 | -0.13 | 0.03 | -0.21 | 0.16 | 0.14 | -0.04 | 0.17 | 0.13 | -0.18 |
| dmdura | 0.44 | 1.00 | -0.02 | -0.10 | -0.05 | -0.11 | -0.01 | -0.18 | -0.02 | -0.05 | -0.11 | 0.03 | -0.05 | 0.05 |
| hba1c | -0.24 | -0.02 | 1.00 | 0.17 | 0.00 | 0.13 | 0.12 | 0.11 | -0.56 | -0.50 | -0.30 | -0.22 | -0.53 | 0.06 |
| ldl | -0.13 | -0.10 | 0.17 | 1.00 | 0.27 | 0.17 | 0.13 | 0.14 | -0.05 | -0.05 | 0.02 | 0.04 | -0.02 | -0.04 |
| hdl | -0.06 | -0.05 | 0.00 | 0.27 | 1.00 | -0.02 | 0.00 | 0.05 | 0.04 | -0.01 | 0.02 | 0.02 | 0.03 | 0.05 |
| trig | -0.13 | -0.11 | 0.13 | 0.17 | -0.02 | 1.00 | 0.12 | 0.17 | -0.09 | -0.08 | -0.04 | 0.02 | -0.07 | -0.12 |
| sbp | 0.03 | -0.01 | 0.12 | 0.13 | 0.00 | 0.12 | 1.00 | 0.53 | -0.05 | -0.02 | -0.02 | 0.02 | -0.03 | 0.03 |
| dbp | -0.21 | -0.18 | 0.11 | 0.14 | 0.05 | 0.17 | 0.53 | 1.00 | -0.05 | -0.05 | -0.02 | -0.04 | -0.05 | 0.04 |
| DMSES.Diet | 0.16 | -0.02 | -0.56 | -0.05 | 0.04 | -0.09 | -0.05 | -0.05 | 1.00 | 0.83 | 0.58 | 0.22 | 0.91 | -0.08 |
| DMSES.Monitor | 0.14 | -0.05 | -0.50 | -0.05 | -0.01 | -0.08 | -0.02 | -0.05 | 0.83 | 1.00 | 0.56 | 0.28 | 0.88 | -0.03 |
| DMSES.Physical | -0.04 | -0.11 | -0.30 | 0.02 | 0.02 | -0.04 | -0.02 | -0.02 | 0.58 | 0.56 | 1.00 | 0.18 | 0.78 | 0.06 |
| DMSES.Regimen | 0.17 | 0.03 | -0.22 | 0.04 | 0.02 | 0.02 | 0.02 | -0.04 | 0.22 | 0.28 | 0.18 | 1.00 | 0.46 | -0.08 |
| DMSES.Total | 0.13 | -0.05 | -0.53 | -0.02 | 0.03 | -0.07 | -0.03 | -0.05 | 0.91 | 0.88 | 0.78 | 0.46 | 1.00 | -0.05 |
| DK.10 | -0.18 | 0.05 | 0.06 | -0.04 | 0.05 | -0.12 | 0.03 | 0.04 | -0.08 | -0.03 | 0.06 | -0.08 | -0.05 | 1.00 |
corrplot(M,method="circle")
# vamos a ordenar las correlaciones
flattenCorrMatrix <- function(cormat) {
ut <- upper.tri(cormat)
dd <- data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut]
)
dd <- dd[order(abs(dd$cor), decreasing = T),]
dd$cor <- round(dd$cor, 3)
dd
}
flattenCorrMatrix(M)
## row column cor
## 75 DMSES.Diet DMSES.Total 0.908
## 76 DMSES.Monitor DMSES.Total 0.876
## 45 DMSES.Diet DMSES.Monitor 0.832
## 77 DMSES.Physical DMSES.Total 0.779
## 54 DMSES.Diet DMSES.Physical 0.578
## 31 hba1c DMSES.Diet -0.563
## 55 DMSES.Monitor DMSES.Physical 0.558
## 69 hba1c DMSES.Total -0.534
## 28 sbp dbp 0.530
## 39 hba1c DMSES.Monitor -0.499
## 78 DMSES.Regimen DMSES.Total 0.461
## 1 age dmdura 0.436
## 48 hba1c DMSES.Physical -0.303
## 65 DMSES.Monitor DMSES.Regimen 0.281
## 10 ldl hdl 0.271
## 2 age hba1c -0.241
## 58 hba1c DMSES.Regimen -0.221
## 64 DMSES.Diet DMSES.Regimen 0.215
## 22 age dbp -0.211
## 79 age DK.10 -0.183
## 23 dmdura dbp -0.179
## 66 DMSES.Physical DMSES.Regimen 0.178
## 56 age DMSES.Regimen 0.172
## 14 ldl trig 0.170
## 6 hba1c ldl 0.167
## 27 trig dbp 0.165
## 29 age DMSES.Diet 0.165
## 25 ldl dbp 0.137
## 37 age DMSES.Monitor 0.136
## 67 age DMSES.Total 0.135
## 13 hba1c trig 0.131
## 11 age trig -0.131
## 19 ldl sbp 0.131
## 4 age ldl -0.129
## 21 trig sbp 0.119
## 18 hba1c sbp 0.118
## 84 trig DK.10 -0.117
## 47 dmdura DMSES.Physical -0.113
## 24 hba1c dbp 0.105
## 12 dmdura trig -0.105
## 5 dmdura ldl -0.102
## 34 trig DMSES.Diet -0.090
## 87 DMSES.Diet DK.10 -0.084
## 42 trig DMSES.Monitor -0.078
## 90 DMSES.Regimen DK.10 -0.076
## 72 trig DMSES.Total -0.069
## 7 age hdl -0.059
## 81 hba1c DK.10 0.056
## 89 DMSES.Physical DK.10 0.055
## 36 dbp DMSES.Diet -0.054
## 40 ldl DMSES.Monitor -0.053
## 74 dbp DMSES.Total -0.053
## 35 sbp DMSES.Diet -0.053
## 44 dbp DMSES.Monitor -0.052
## 68 dmdura DMSES.Total -0.052
## 26 hdl dbp 0.051
## 32 ldl DMSES.Diet -0.051
## 80 dmdura DK.10 0.051
## 38 dmdura DMSES.Monitor -0.049
## 8 dmdura hdl -0.048
## 91 DMSES.Total DK.10 -0.046
## 83 hdl DK.10 0.046
## 46 age DMSES.Physical -0.043
## 51 trig DMSES.Physical -0.043
## 82 ldl DK.10 -0.040
## 59 ldl DMSES.Regimen 0.038
## 86 dbp DK.10 0.038
## 33 hdl DMSES.Diet 0.037
## 63 dbp DMSES.Regimen -0.036
## 73 sbp DMSES.Total -0.032
## 57 dmdura DMSES.Regimen 0.031
## 88 DMSES.Monitor DK.10 -0.030
## 16 age sbp 0.028
## 71 hdl DMSES.Total 0.027
## 85 sbp DK.10 0.026
## 50 hdl DMSES.Physical 0.025
## 61 trig DMSES.Regimen 0.024
## 15 hdl trig -0.023
## 49 ldl DMSES.Physical 0.023
## 62 sbp DMSES.Regimen 0.022
## 43 sbp DMSES.Monitor -0.022
## 52 sbp DMSES.Physical -0.021
## 30 dmdura DMSES.Diet -0.021
## 53 dbp DMSES.Physical -0.020
## 70 ldl DMSES.Total -0.020
## 60 hdl DMSES.Regimen 0.017
## 3 dmdura hba1c -0.016
## 17 dmdura sbp -0.012
## 41 hdl DMSES.Monitor -0.012
## 20 hdl sbp -0.005
## 9 hba1c hdl 0.003
cor(hba1c,DMSES.Diet)
## [1] -0.5630537
cor(hba1c,DMSES.Physical)
## [1] -0.3027804
cor(hba1c,DMSES.Monitor)
## [1] -0.4987917
cor(hba1c,DMSES.Regimen)
## [1] -0.2206762
cor(hba1c,DMSES.Total)
## [1] -0.5339001
Podemos ver que las diferentes puntuaciones del cuestionario DMSES (total o de alguna de sus partes) están muy relacionadas. De entre todas las puntuaciones, la del factor dieta es la que mayor correlación tienen con la hba1c, por lo que parece razonable centrarse en el uso de sólo esa puntuación para buscar una relación con la hba1c y descartar el resto de puntuaciones del cuestionario DMSES del análisis.
Vemos en que medida están relacionadas las puntuaciones del cuestiorio DMSES entre sí.
# demostracion DMSES total es la suma de los 4 subscores
max(DMSES.Total - (DMSES.Diet + DMSES.Monitor + DMSES.Physical + DMSES.Regimen))
## [1] 7.105427e-15
# relación entre los diferentes subscores
ggpairs(datos[,c("DMSES.Diet","DMSES.Monitor","DMSES.Physical","DMSES.Regimen","DMSES.Total")])
Al querer buscar las relaciones entre las puntuaciones del cuestionario DMSES (la puntuación total y la de las partes ‘dieta’, ‘monitor’, ‘régimen’ y ‘estado físico’) y el nivel de hba1c se observa que no hay una relación lineal a priori. Se valorará hacer transformaciones adecuadas para comprobar si se puede hallar una relación.
par(mfrow=c(2,3))
pairs(~ hba1c + DMSES.Diet ,data=datos)
pairs(~ hba1c + DMSES.Monitor,data=datos)
pairs(~ hba1c + DMSES.Physical,data=datos)
pairs(~ hba1c + DMSES.Regimen,data=datos)
pairs(~ hba1c + DMSES.Total,data=datos)
par(mfrow=c(1,1))
Tampoco se detecta una relación clara entre las puntuaciones del cuestionario DMSES y la variable categórica ‘alerta’.
pairs(~ alerta + DMSES.Diet ,data=datos)
pairs(~ alerta + DMSES.Monitor,data=datos)
pairs(~ alerta + DMSES.Physical,data=datos)
pairs(~ alerta + DMSES.Regimen,data=datos)
pairs(~ alerta + DMSES.Total,data=datos)
Análisis multivariante discreto
Las variables discretas con poca variabilidad no se toman en cuenta (cva, cereinfrac, ishemic, stroke, cerebhem,tia, angia, chf, cororevas, pad, neuropath )
chisq.test(datos$alerta,datos$hospid)
##
## Pearson's Chi-squared test
##
## data: datos$alerta and datos$hospid
## X-squared = 21.203, df = 3, p-value = 9.551e-05
chisq.test(datos$alerta,datos$gender)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$gender
## X-squared = 0.32616, df = 1, p-value = 0.5679
chisq.test(datos$alerta,datos$mstatus)
## Warning in chisq.test(datos$alerta, datos$mstatus): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: datos$alerta and datos$mstatus
## X-squared = 12.419, df = 4, p-value = 0.01449
chisq.test(datos$alerta,datos$edu)
## Warning in chisq.test(datos$alerta, datos$edu): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: datos$alerta and datos$edu
## X-squared = 8.9276, df = 5, p-value = 0.112
chisq.test(datos$alerta,datos$religion)
## Warning in chisq.test(datos$alerta, datos$religion): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: datos$alerta and datos$religion
## X-squared = 2.6512, df = 2, p-value = 0.2656
chisq.test(datos$alerta,datos$income)
##
## Pearson's Chi-squared test
##
## data: datos$alerta and datos$income
## X-squared = 11.121, df = 5, p-value = 0.04904
chisq.test(datos$alerta,datos$famhx)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$famhx
## X-squared = 0.9669, df = 1, p-value = 0.3255
chisq.test(datos$alerta,datos$comob)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$comob
## X-squared = 1.2299, df = 1, p-value = 0.2674
chisq.test(datos$alerta,datos$comlip)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$comlip
## X-squared = 1.0754, df = 1, p-value = 0.2997
chisq.test(datos$alerta,datos$comht)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$comht
## X-squared = 0.63726, df = 1, p-value = 0.4247
chisq.test(datos$alerta,datos$comchd)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$comchd
## X-squared = 4.8263, df = 1, p-value = 0.02803
chisq.test(datos$alerta,datos$comkid)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$comkid
## X-squared = 39.775, df = 1, p-value = 2.849e-10
chisq.test(datos$alerta,datos$comoth)
## Warning in chisq.test(datos$alerta, datos$comoth): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$comoth
## X-squared = 4.3425e-28, df = 1, p-value = 1
chisq.test(datos$alerta,datos$dmrx)
##
## Pearson's Chi-squared test
##
## data: datos$alerta and datos$dmrx
## X-squared = 75.467, df = 3, p-value = 2.878e-16
chisq.test(datos$alerta,datos$smk)
##
## Pearson's Chi-squared test
##
## data: datos$alerta and datos$smk
## X-squared = 0.77241, df = 2, p-value = 0.6796
chisq.test(datos$alerta,datos$alcohol)
##
## Pearson's Chi-squared test
##
## data: datos$alerta and datos$alcohol
## X-squared = 0.75205, df = 2, p-value = 0.6866
chisq.test(datos$alerta,datos$compli)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$compli
## X-squared = 97.093, df = 1, p-value < 2.2e-16
chisq.test(datos$alerta,datos$mi)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$mi
## X-squared = 6.7972, df = 1, p-value = 0.00913
chisq.test(datos$alerta,datos$renal)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$renal
## X-squared = 22.828, df = 1, p-value = 1.772e-06
chisq.test(datos$alerta,datos$dn)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$dn
## X-squared = 14.389, df = 1, p-value = 0.0001487
chisq.test(datos$alerta,datos$dr)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$dr
## X-squared = 65.391, df = 1, p-value = 6.141e-16
chisq.test(datos$alerta,datos$sobrepeso)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$sobrepeso
## X-squared = 2.4841, df = 1, p-value = 0.115
chisq.test(datos$alerta,datos$obesidad)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: datos$alerta and datos$obesidad
## X-squared = 0.1288, df = 1, p-value = 0.7197
El objetivo de este apartado será comprobar si la puntuación del test DMSES (total o de alguno de sus apartados) puede predecir el valor de la variable ‘hba1c’.
par(mfrow=c(2,2))
mod <- lm(hba1c ~ DMSES.Total + DK.10 + age + renal + ldl + bmi,data=datos)
(smod <- summary(mod))
##
## Call:
## lm(formula = hba1c ~ DMSES.Total + DK.10 + age + renal + ldl +
## bmi, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6945 -0.8032 -0.1390 0.6161 6.4182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.902466 0.543628 16.376 < 2e-16 ***
## DMSES.Total -0.631903 0.041203 -15.336 < 2e-16 ***
## DK.10 0.019167 0.026675 0.719 0.4727
## age -0.028289 0.005113 -5.533 4.46e-08 ***
## renalSí 0.629150 0.143612 4.381 1.36e-05 ***
## ldl 0.007478 0.001714 4.363 1.48e-05 ***
## bmi -0.016461 0.008569 -1.921 0.0551 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.377 on 693 degrees of freedom
## Multiple R-squared: 0.3538, Adjusted R-squared: 0.3482
## F-statistic: 63.24 on 6 and 693 DF, p-value: < 2.2e-16
plot(mod)
vif(mod)
## DMSES.Total DK.10 age renalSí ldl bmi
## 1.076190 1.051559 1.153334 1.067002 1.021586 1.083572
# eliminamos DK.10
mod <- lm(hba1c ~ DMSES.Total + age + renal + ldl + bmi,data=datos)
(smod <- summary(mod))
##
## Call:
## lm(formula = hba1c ~ DMSES.Total + age + renal + ldl + bmi, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7610 -0.7966 -0.1314 0.5947 6.3716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.064169 0.494685 18.323 < 2e-16 ***
## DMSES.Total -0.633259 0.041145 -15.391 < 2e-16 ***
## age -0.028930 0.005032 -5.749 1.35e-08 ***
## renalSí 0.618438 0.142786 4.331 1.70e-05 ***
## ldl 0.007401 0.001710 4.328 1.72e-05 ***
## bmi -0.016621 0.008564 -1.941 0.0527 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.377 on 694 degrees of freedom
## Multiple R-squared: 0.3533, Adjusted R-squared: 0.3487
## F-statistic: 75.84 on 5 and 694 DF, p-value: < 2.2e-16
plot(mod)
vif(mod)
## DMSES.Total age renalSí ldl bmi
## 1.073931 1.118247 1.055502 1.017605 1.082847
# eliminamos BMI
mod <- lm(hba1c ~ DMSES.Total + age + renal + ldl ,data=datos)
(smod <- summary(mod))
##
## Call:
## lm(formula = hba1c ~ DMSES.Total + age + renal + ldl, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6961 -0.7706 -0.1625 0.5901 6.5332
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.454447 0.382888 22.081 < 2e-16 ***
## DMSES.Total -0.626578 0.041082 -15.252 < 2e-16 ***
## age -0.026500 0.004884 -5.426 7.97e-08 ***
## renalSí 0.611629 0.143027 4.276 2.17e-05 ***
## ldl 0.007424 0.001713 4.334 1.68e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.379 on 695 degrees of freedom
## Multiple R-squared: 0.3498, Adjusted R-squared: 0.3461
## F-statistic: 93.48 on 4 and 695 DF, p-value: < 2.2e-16
plot(mod)
vif(mod)
## DMSES.Total age renalSí ldl
## 1.066413 1.049026 1.054865 1.017554
par(mfrow=c(1,1))
No se consigue tener un R2 elevado ni corregir la no normalidad de los residuos. Aplicamos logaritmos a ‘hba1c’ (variable estrictamente positiva) y se mejora un poco la situación en cuanto a normalidad pero el R2 sigue sin subir. Búsqueda de outliers pendiente.
par(mfrow=c(2,2))
mod <- lm(log(hba1c) ~ DMSES.Total + age + renal + ldl ,data=datos)
(smod <- summary(mod))
##
## Call:
## lm(formula = log(hba1c) ~ DMSES.Total + age + renal + ldl, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.32692 -0.09223 -0.01355 0.08516 0.59417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1135308 0.0470599 44.911 < 2e-16 ***
## DMSES.Total -0.0786385 0.0050494 -15.574 < 2e-16 ***
## age -0.0031851 0.0006003 -5.306 1.51e-07 ***
## renalSí 0.0684713 0.0175791 3.895 0.000108 ***
## ldl 0.0008448 0.0002106 4.012 6.67e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1695 on 695 degrees of freedom
## Multiple R-squared: 0.3504, Adjusted R-squared: 0.3466
## F-statistic: 93.7 on 4 and 695 DF, p-value: < 2.2e-16
plot(mod)
vif(mod)
## DMSES.Total age renalSí ldl
## 1.066413 1.049026 1.054865 1.017554
par(mfrow=c(1,1))
El objetivo de este apartado será comprobar si la puntuación del test DMSES (total o de alguno de sus apartados) puede ayudar a discriminar entre pacientes que controlan la diabetes y los que no.
# cálculo del numero de registros para el training
# según la proporción de registros dedicado al training
pct.train <- 0.90
n <- nrow(datos)
n.train <- floor(n*pct.train)
set.seed(123)
ind.training <- sample(1:n,n.train)
length(ind.training)
## [1] 630
datos.train <- datos[ind.training,]
datos.test <- datos[-ind.training,]
dim(datos.train)
## [1] 630 73
dim(datos.test)
## [1] 70 73
logdiet <- glm(alerta ~ DMSES.Diet, data = datos.train, family = "binomial")
summary(logdiet)
##
## Call:
## glm(formula = alerta ~ DMSES.Diet, family = "binomial", data = datos.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4594 -0.6957 0.3187 0.6533 2.3759
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1710 0.1046 1.636 0.102
## DMSES.Diet -2.8435 0.2103 -13.524 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 871.53 on 629 degrees of freedom
## Residual deviance: 574.09 on 628 degrees of freedom
## AIC: 578.09
##
## Number of Fisher Scoring iterations: 4
predicciones <- predict(logdiet,newdata=datos.test[,-c(1)],type='response')
predicciones.categ <- ifelse(predicciones > 0.5,1,0)
predicciones.categ <- factor(predicciones.categ,levels=c(0,1),labels=c("No","Sí"))
# Confusion matrix
confusionMatrix(data=predicciones.categ, reference=datos.test$alerta)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Sí
## No 27 4
## Sí 8 31
##
## Accuracy : 0.8286
## 95% CI : (0.7197, 0.9082)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.124e-08
##
## Kappa : 0.6571
##
## Mcnemar's Test P-Value : 0.3865
##
## Sensitivity : 0.7714
## Specificity : 0.8857
## Pos Pred Value : 0.8710
## Neg Pred Value : 0.7949
## Prevalence : 0.5000
## Detection Rate : 0.3857
## Detection Prevalence : 0.4429
## Balanced Accuracy : 0.8286
##
## 'Positive' Class : No
##
# ROC and AUC
pr <- prediction(predicciones, datos.test$alerta)
# TPR = sensitivity, FPR=specificity
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
(auc <- auc@y.values[[1]])
## [1] 0.9273469
#
#
# hay mejora si añadimos algunos predictores?
logdiet <- glm(alerta ~ DMSES.Diet + bmi + dmdura + age, data = datos.train, family = "binomial")
summary(logdiet)
##
## Call:
## glm(formula = alerta ~ DMSES.Diet + bmi + dmdura + age, family = "binomial",
## data = datos.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4339 -0.6488 0.2882 0.6549 2.3968
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.36401 0.93008 3.617 0.000298 ***
## DMSES.Diet -2.84441 0.21587 -13.177 < 2e-16 ***
## bmi -0.03893 0.01756 -2.217 0.026638 *
## dmdura 0.02090 0.01450 1.441 0.149525
## age -0.03727 0.01145 -3.256 0.001129 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 871.53 on 629 degrees of freedom
## Residual deviance: 560.64 on 625 degrees of freedom
## AIC: 570.64
##
## Number of Fisher Scoring iterations: 5
predicciones <- predict(logdiet,newdata=datos.test[,-c(1)],type='response')
predicciones.categ <- ifelse(predicciones > 0.5,1,0)
predicciones.categ <- factor(predicciones.categ,levels=c(0,1),labels=c("No","Sí"))
# Confusion matrix
confusionMatrix(data=predicciones.categ, reference=datos.test$alerta)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Sí
## No 28 4
## Sí 7 31
##
## Accuracy : 0.8429
## 95% CI : (0.7362, 0.9189)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 2.233e-09
##
## Kappa : 0.6857
##
## Mcnemar's Test P-Value : 0.5465
##
## Sensitivity : 0.8000
## Specificity : 0.8857
## Pos Pred Value : 0.8750
## Neg Pred Value : 0.8158
## Prevalence : 0.5000
## Detection Rate : 0.4000
## Detection Prevalence : 0.4571
## Balanced Accuracy : 0.8429
##
## 'Positive' Class : No
##
# ROC and AUC
pr <- prediction(predicciones, datos.test$alerta)
# TPR = sensitivity, FPR=specificity
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
(auc <- auc@y.values[[1]])
## [1] 0.9322449
logmonitor <- glm(alerta ~ DMSES.Monitor, data = datos.train, family = "binomial")
summary(logmonitor)
##
## Call:
## glm(formula = alerta ~ DMSES.Monitor, family = "binomial", data = datos.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3135 -0.7761 0.3617 0.7996 2.2797
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1367 0.0977 1.399 0.162
## DMSES.Monitor -5.0037 0.4026 -12.430 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 871.53 on 629 degrees of freedom
## Residual deviance: 637.30 on 628 degrees of freedom
## AIC: 641.3
##
## Number of Fisher Scoring iterations: 4
predicciones <- predict(logmonitor,newdata=datos.test[,-c(1)],type='response')
predicciones.categ <- ifelse(predicciones > 0.5,1,0)
predicciones.categ <- factor(predicciones.categ,levels=c(0,1),labels=c("No","Sí"))
# Confusion matrix
confusionMatrix(data=predicciones.categ, reference=datos.test$alerta)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Sí
## No 29 6
## Sí 6 29
##
## Accuracy : 0.8286
## 95% CI : (0.7197, 0.9082)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 1.124e-08
##
## Kappa : 0.6571
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8286
## Specificity : 0.8286
## Pos Pred Value : 0.8286
## Neg Pred Value : 0.8286
## Prevalence : 0.5000
## Detection Rate : 0.4143
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.8286
##
## 'Positive' Class : No
##
# ROC and AUC
pr <- prediction(predicciones, datos.test$alerta)
# TPR = sensitivity, FPR=specificity
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
(auc <- auc@y.values[[1]])
## [1] 0.8653061
logregimen <- glm(alerta ~ DMSES.Regimen, data = datos.train, family = "binomial")
summary(logregimen)
##
## Call:
## glm(formula = alerta ~ DMSES.Regimen, family = "binomial", data = datos.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4598 -1.1805 0.8935 1.1648 1.3501
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.11596 0.08078 1.436 0.15111
## DMSES.Regimen -0.99971 0.34427 -2.904 0.00369 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 871.53 on 629 degrees of freedom
## Residual deviance: 860.52 on 628 degrees of freedom
## AIC: 864.52
##
## Number of Fisher Scoring iterations: 3
predicciones <- predict(logregimen,newdata=datos.test[,-c(1)],type='response')
predicciones.categ <- ifelse(predicciones > 0.5,1,0)
predicciones.categ <- factor(predicciones.categ,levels=c(0,1),labels=c("No","Sí"))
# Confusion matrix
confusionMatrix(data=predicciones.categ, reference=datos.test$alerta)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Sí
## No 10 0
## Sí 25 35
##
## Accuracy : 0.6429
## 95% CI : (0.5193, 0.7539)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.01123
##
## Kappa : 0.2857
##
## Mcnemar's Test P-Value : 1.587e-06
##
## Sensitivity : 0.2857
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.5833
## Prevalence : 0.5000
## Detection Rate : 0.1429
## Detection Prevalence : 0.1429
## Balanced Accuracy : 0.6429
##
## 'Positive' Class : No
##
# ROC and AUC
pr <- prediction(predicciones, datos.test$alerta)
# TPR = sensitivity, FPR=specificity
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
(auc <- auc@y.values[[1]])
## [1] 0.802449
logphysical <- glm(alerta ~ DMSES.Physical, data = datos.train, family = "binomial")
summary(logphysical)
##
## Call:
## glm(formula = alerta ~ DMSES.Physical, family = "binomial", data = datos.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.047 -1.086 0.688 1.023 1.660
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.12349 0.08406 1.469 0.142
## DMSES.Physical -1.53195 0.20506 -7.471 7.98e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 871.53 on 629 degrees of freedom
## Residual deviance: 808.17 on 628 degrees of freedom
## AIC: 812.17
##
## Number of Fisher Scoring iterations: 4
predicciones <- predict(logphysical,newdata=datos.test[,-c(1)],type='response')
predicciones.categ <- ifelse(predicciones > 0.5,1,0)
predicciones.categ <- factor(predicciones.categ,levels=c(0,1),labels=c("No","Sí"))
# Confusion matrix
confusionMatrix(data=predicciones.categ, reference=datos.test$alerta)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Sí
## No 23 9
## Sí 12 26
##
## Accuracy : 0.7
## 95% CI : (0.5787, 0.8038)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.0005466
##
## Kappa : 0.4
##
## Mcnemar's Test P-Value : 0.6625206
##
## Sensitivity : 0.6571
## Specificity : 0.7429
## Pos Pred Value : 0.7188
## Neg Pred Value : 0.6842
## Prevalence : 0.5000
## Detection Rate : 0.3286
## Detection Prevalence : 0.4571
## Balanced Accuracy : 0.7000
##
## 'Positive' Class : No
##
# ROC and AUC
pr <- prediction(predicciones, datos.test$alerta)
# TPR = sensitivity, FPR=specificity
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
(auc <- auc@y.values[[1]])
## [1] 0.84
pca <- prcomp(datos[,vars_cont],scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.9614 1.3865 1.17696 1.0979 1.06861 0.94293 0.90553
## Proportion of Variance 0.2748 0.1373 0.09895 0.0861 0.08157 0.06351 0.05857
## Cumulative Proportion 0.2748 0.4121 0.51104 0.5971 0.67871 0.74221 0.80078
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.8499 0.77263 0.7010 0.65595 0.62517 0.39650 1.46e-15
## Proportion of Variance 0.0516 0.04264 0.0351 0.03073 0.02792 0.01123 0.00e+00
## Cumulative Proportion 0.8524 0.89502 0.9301 0.96085 0.98877 1.00000 1.00e+00
plot(pca)
pca$rotation[,1:2]
## PC1 PC2
## age 0.120473735 0.40392741
## dmdura -0.003866627 0.40279170
## hba1c -0.342354967 -0.12541985
## ldl -0.048651944 -0.34591508
## hdl 0.004776580 -0.17933358
## trig -0.071672193 -0.28174001
## sbp -0.050084242 -0.36415949
## dbp -0.069185880 -0.48999049
## DMSES.Diet 0.463976598 -0.05343762
## DMSES.Monitor 0.453296921 -0.07260538
## DMSES.Physical 0.368541298 -0.18103332
## DMSES.Regimen 0.217607148 -0.01393538
## DMSES.Total 0.498118566 -0.10601664
## DK.10 -0.037313670 -0.05137537
al <- as.numeric(datos$alerta)
MASS::eqscplot(pca$x[,1], pca$x[,2], pch=c(3,21)[al],col=c("blue","red")[al],
bg="lightblue",xlab="PC1",ylab="PC2")
abline(h=0,v=0,lty=4,col="gray")
legend("topright",pch=c(3,21),col=c("blue","red"), cex=0.8,legend=c("No","Si"))
title(main="Diabetes mas controlada",line=1)
summary(pca)$importance[ ,1:14]
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.961366 1.386494 1.176962 1.097902 1.06861 0.9429273
## Proportion of Variance 0.274780 0.137310 0.098950 0.086100 0.08157 0.0635100
## Cumulative Proportion 0.274780 0.412090 0.511040 0.597140 0.67871 0.7422100
## PC7 PC8 PC9 PC10 PC11
## Standard deviation 0.9055297 0.8499058 0.7726264 0.7010122 0.655951
## Proportion of Variance 0.0585700 0.0516000 0.0426400 0.0351000 0.030730
## Cumulative Proportion 0.8007800 0.8523800 0.8950200 0.9301200 0.960850
## PC12 PC13 PC14
## Standard deviation 0.6251666 0.3964954 1.459512e-15
## Proportion of Variance 0.0279200 0.0112300 0.000000e+00
## Cumulative Proportion 0.9887700 1.0000000 1.000000e+00
library(factoextra)
fviz_eig(pca)
fviz_pca_var(pca,col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE )
El analísis gráfico de componentes principales muestra la contribución de las variables en las dos primeras componentes principales.
library(cluster)
# Fijamos la semilla aleatoria
set.seed(123)
df <- scale(datos[,vars_cont])
res.dist <- get_dist(df, method = "euclidean")
head(round(as.matrix(res.dist), 2))[, 1:5]
## 1 2 3 4 5
## 1 0.00 2.59 5.08 4.82 4.01
## 2 2.59 0.00 4.99 5.42 3.99
## 3 5.08 4.99 0.00 7.55 7.46
## 4 4.82 5.42 7.55 0.00 5.38
## 5 4.01 3.99 7.46 5.38 0.00
## 6 2.46 2.78 6.44 4.40 3.12
# Visualize the dissimilarity matrix
fviz_dist(res.dist, lab_size = 5)
# Compute hierarchical clustering
res.hc <- hclust(res.dist, method = "ward.D2")
# Visualize
plot(res.hc, cex = 0.5)
# Enhanced k-means clustering
res.km <- eclust(df, "kmeans", nstart = 25, k.max=12)
# Gap statistic plot
fviz_gap_stat(res.km$gap_stat)
# Silhouette plot
fviz_silhouette(res.km)
## cluster size ave.sil.width
## 1 1 375 0.24
## 2 2 325 0.13
# Optimal number of clusters using gap statistics
res.km$nbclust
## [1] 2
# Enhanced k-means clustering (forzando 11 conglomerados)
res.km11 <- eclust(df, FUNcluster = "kmeans", nstart = 25, k = 11)
# Gap statistic plot
fviz_gap_stat(res.km11$gap_stat)
# Silhouette plot
fviz_silhouette(res.km11)
library(cluster)
# Fijamos la semilla aleatoria
set.seed(123)
df <- scale(datos[,c("hba1c","DMSES.Diet","age")])
res.dist <- get_dist(df, method = "euclidean")
head(round(as.matrix(res.dist), 2))[, 1:5]
## 1 2 3 4 5
## 1 0.00 1.58 2.16 2.17 2.43
## 2 1.58 0.00 1.90 1.79 2.47
## 3 2.16 1.90 0.00 3.55 4.07
## 4 2.17 1.79 3.55 0.00 1.82
## 5 2.43 2.47 4.07 1.82 0.00
## 6 1.32 1.13 2.76 0.92 1.70
# Visualize the dissimilarity matrix
fviz_dist(res.dist, lab_size = 5)
# Compute hierarchical clustering
res.hc <- hclust(res.dist, method = "ward.D2")
# Visualize
plot(res.hc, cex = 0.5)
# Enhanced k-means clustering
res.km <- eclust(df, "kmeans", nstart = 25, k.max=12)
# Gap statistic plot
fviz_gap_stat(res.km$gap_stat)
# Silhouette plot
fviz_silhouette(res.km)
## cluster size ave.sil.width
## 1 1 351 0.28
## 2 2 349 0.44
# Optimal number of clusters using gap statistics
res.km$nbclust
## [1] 2
# Alerta vs Diet
fit.cv <- lda(alerta ~ DMSES.Diet, data=datos, CV=TRUE)
# Assess the accuracy of the prediction
# percent correct for each category of G
ct <- table(datos$alerta, fit.cv$class)
diag(prop.table(ct, 1))
## No Sí
## 0.7957958 0.8283379
prop.table(ct, 1)
##
## No Sí
## No 0.7957958 0.2042042
## Sí 0.1716621 0.8283379
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.8128571
# Alerta vs Diet + age
fit.cv <- lda(alerta ~ DMSES.Diet + age, data=datos, CV=TRUE)
# Assess the accuracy of the prediction
# percent correct for each category of G
ct <- table(datos$alerta, fit.cv$class)
diag(prop.table(ct, 1))
## No Sí
## 0.8018018 0.8337875
prop.table(ct, 1)
##
## No Sí
## No 0.8018018 0.1981982
## Sí 0.1662125 0.8337875
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.8185714
partimat(alerta ~ DMSES.Diet + age, data=datos, method="lda")
# Alerta vs otras variables
fit.cv <- lda(alerta ~ DMSES.Diet + age + ldl + hdl + trig + bmi, data=datos, CV=TRUE)
summary(fit.cv)
## Length Class Mode
## class 700 factor numeric
## posterior 1400 -none- numeric
## terms 3 terms call
## call 4 -none- call
## xlevels 0 -none- list
# Assess the accuracy of the prediction
# percent correct for each category of G
ct <- table(datos$alerta, fit.cv$class)
diag(prop.table(ct, 1))
## No Sí
## 0.8018018 0.8337875
prop.table(ct, 1)
##
## No Sí
## No 0.8018018 0.1981982
## Sí 0.1662125 0.8337875
# total percent correct
sum(diag(prop.table(ct)))
## [1] 0.8185714
prop.table(ct)
##
## No Sí
## No 0.38142857 0.09428571
## Sí 0.08714286 0.43714286
# Analisis discriminante cuadrático
datos.qda <- qda(alerta ~ DMSES.Diet + age, data=datos)
summary(datos.qda)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 4 -none- numeric
## scaling 8 -none- numeric
## ldet 2 -none- numeric
## lev 2 -none- character
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
clq <- predict(datos.qda, datos[,c("DMSES.Diet","age")])$class
ct <- table(datos$alerta, clq)
prop.table(ct)
## clq
## No Sí
## No 0.37285714 0.10285714
## Sí 0.08285714 0.44142857